1 Setup

suppressPackageStartupMessages({
  library(data.table)
  library(DESeq2)
  library(gplots)
  library(here)
  library(hyperSpec)
  library(parallel)
  library(pander)
  library(plotly)
  library(RColorBrewer)
  library(scatterplot3d)
  library(tidyverse)
  library(tximport)
  library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv(here("doc/variables.csv"),
                      col_types=cols(
                        col_character(),
                        col_factor(),
                        col_factor(),
                        col_factor(),
                        col_factor()
                      ))

2 Raw data

tx2gene translation table

tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene.txt"),delim="\t",
                                       col_names=c("TXID","GENE")))

3 Raw data

filelist <- list.files(here("data/salmon"), 
                       recursive = TRUE, 
                       pattern = "quant.sf",
                       full.names = TRUE)

Sanity check to ensure that the data is sorted according to the sample info

names(filelist) <- sub("_S\\d+.*","",basename(dirname(filelist)))
stopifnot(all(samples$ID==names(filelist)))

Read the expression at the gene level

This gives us directly gene counts

counts <- suppressMessages(round(tximport(files = filelist, 
                                          type = "salmon",
                                          tx2gene=tx2gene)$counts))

This is for transcript counts

tx <- suppressMessages(tximport(files = filelist, 
                                      type = "salmon",
                                      txOut = TRUE))
tx.counts <- round(tx$counts)

# counts <- summarizeToGene(tx,tx2gene=tx2gene)

3.1 Quality Control

  • Check how many transcripts are never expressed
sel <- rowSums(tx.counts) == 0
sprintf("%s%% percent (%s) of %s transcripts are not expressed",
        round(sum(sel) * 100/ nrow(tx.counts),digits=1),
        sum(sel),
        nrow(tx.counts))
## [1] "14.9% percent (7940) of 53467 transcripts are not expressed"
  • Let us take a look at the sequencing depth, colouring by TISSUE
dat <- tibble(x=colnames(tx.counts),y=colSums(tx.counts)) %>% 
  bind_cols(samples)

ggplot(dat,aes(x,y,fill=TISSUE)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())

  • Display the per-transcript mean expression

i.e. the mean raw count of every transcript across samples is calculated and displayed on a log10 scale.

The cumulative transcript coverage is as expected

ggplot(data.frame(value=log10(rowMeans(tx.counts))),aes(x=value)) + 
  geom_density() + ggtitle("transcript mean raw tx.counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 7940 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by TISSUE

dat <- as.data.frame(log10(tx.counts)) %>% utils::stack() %>% 
  mutate(TISSUE=samples$TISSUE[match(ind,samples$ID)]) %>% 
  mutate(TIME=samples$TIME[match(ind,samples$ID)]) %>% 
  mutate(TREATMENT=samples$TREATMENT[match(ind,samples$ID)])

ggplot(dat,aes(x=values,group=ind,col=TISSUE)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per transcript raw counts (log10)")
## Warning: Removed 661300 rows containing non-finite values (stat_density).

3.2 Export

dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write_csv(as.data.frame(tx.counts) %>% rownames_to_column("ID"),
          here("data/analysis/salmon/raw-unormalised-transcript-expression_data.csv"))

4 Data normalisation

4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.

dds <- DESeqDataSetFromMatrix(
  countData = tx.counts,
  colData = samples,
  design = ~ TISSUE * CONDITION)
## converting counts to integer mode

4.2 Remove bad sample

dds <- dds[,-match("P14065_119",colnames(dds))]

save(dds,file=here("data/analysis/salmon/dds-transcripts.rda"))

Check the size factors (i.e. the sequencing library size effect)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P14065_101 P14065_102 P14065_103 P14065_104 P14065_105 P14065_106
0.9059 0.5975 0.7959 0.7663 1.074 0.9132
Table continues below
P14065_107 P14065_108 P14065_109 P14065_110 P14065_111 P14065_112
1.131 0.5848 0.7795 0.7331 0.4832 1.034
Table continues below
P14065_113 P14065_114 P14065_115 P14065_116 P14065_117 P14065_118
1.043 0.9788 0.6982 1.778 1.527 0.9935
Table continues below
P14065_120 P14065_121 P14065_122 P14065_123 P14065_124 P14065_125
1.158 1.857 0.9216 0.9973 1.959 1.693
P14065_126 P14065_127 P14065_128 P14065_129 P14065_130
1.079 1.081 1.135 1.317 1.151
boxplot(sizes, main="Sequencing libraries size factor")
abline(h=1,lty=2,col="grey")

4.3 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(vst)>0,])

4.4 QC on the normalised data

4.4.1 PCA

pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
  • Cumulative components effect

We define the number of variable of the model

nvar=2

An the number of possible combinations

nlevel=nlevels(dds$TISSUE) * nlevels(dds$CONDITION)

We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.

ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
  geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
  scale_x_continuous("Principal component") + 
  geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
  geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) + 
  geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)

4.4.2 3 first dimensions

The PCA shows that a large fraction of the variance is explained by both variables.

scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(dds$CONDITION)],
              pch=c(17,19)[as.integer(dds$TISSUE)])
legend("topleft",
       fill=pal[1:nlevels(dds$CONDITION)],
       legend=levels(dds$CONDITION))

legend("topright",
       pch=c(17,19),
       legend=levels(dds$TISSUE))

par(mar=mar)

4.4.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    as.data.frame(colData(dds)))

The most variance comes from the tissues. In the leaf, the time has more effect than the treatment, while this look the opposite in the apex.

p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=CONDITION,shape=TISSUE,text=ID)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized tx.counts")

ggplotly(p) %>% 
  layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
         yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

4.4.4 Heatmap

Filter for noise

conds <- factor(paste(dds$TISSUE,dds$CONDITION))
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=2)

vst.cutoff <- 2
  • Heatmap of “all” transcripts
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

plot(as.hclust(hm$colDendrogram),xlab="",sub="")

4.5 Conclusion

The main differences in the leaf samples are according to the days of sampling. There are no obvious differences at day 3 between treatments but it looks different at day 1 with higher differences. In the case of the apex, the bigger differences appear at day 3. There is a clear separation at day 1 too but day 0 appear grouped close to at 1

5 Session Info

## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.54.0                  tximport_1.14.0            
##  [3] forcats_0.4.0               stringr_1.4.0              
##  [5] dplyr_0.8.4                 purrr_0.3.3                
##  [7] readr_1.3.1                 tidyr_1.0.2                
##  [9] tibble_2.1.3                tidyverse_1.3.0            
## [11] scatterplot3d_0.3-41        RColorBrewer_1.1-2         
## [13] plotly_4.9.2                pander_0.6.3               
## [15] hyperSpec_0.99-20200213     xml2_1.2.2                 
## [17] ggplot2_3.2.1               lattice_0.20-38            
## [19] here_0.1                    gplots_3.0.1.2             
## [21] DESeq2_1.26.0               SummarizedExperiment_1.16.1
## [23] DelayedArray_0.12.2         BiocParallel_1.20.1        
## [25] matrixStats_0.55.0          Biobase_2.46.0             
## [27] GenomicRanges_1.38.0        GenomeInfoDb_1.22.0        
## [29] IRanges_2.20.2              S4Vectors_0.24.3           
## [31] BiocGenerics_0.32.0         data.table_1.12.8          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.5        Hmisc_4.3-1           
##   [4] lazyeval_0.2.2         splines_3.6.2          crosstalk_1.0.0       
##   [7] digest_0.6.24          htmltools_0.4.0        gdata_2.18.0          
##  [10] fansi_0.4.1            magrittr_1.5           checkmate_2.0.0       
##  [13] memoise_1.1.0          cluster_2.1.0          limma_3.42.2          
##  [16] annotate_1.64.0        modelr_0.1.5           jpeg_0.1-8.1          
##  [19] colorspace_1.4-1       blob_1.2.1             rvest_0.3.5           
##  [22] haven_2.2.0            xfun_0.12              crayon_1.3.4          
##  [25] RCurl_1.98-1.1         jsonlite_1.6.1         hexbin_1.28.1         
##  [28] genefilter_1.68.0      survival_3.1-8         glue_1.3.1            
##  [31] gtable_0.3.0           zlibbioc_1.32.0        XVector_0.26.0        
##  [34] scales_1.1.0           DBI_1.1.0              Rcpp_1.0.3            
##  [37] viridisLite_0.3.0      xtable_1.8-4           htmlTable_1.13.3      
##  [40] foreign_0.8-75         bit_1.1-15.2           preprocessCore_1.48.0 
##  [43] Formula_1.2-3          htmlwidgets_1.5.1      httr_1.4.1            
##  [46] acepack_1.4.1          pkgconfig_2.0.3        XML_3.99-0.3          
##  [49] farver_2.0.3           nnet_7.3-12            dbplyr_1.4.2          
##  [52] locfit_1.5-9.1         tidyselect_1.0.0       labeling_0.3          
##  [55] rlang_0.4.4            later_1.0.0            AnnotationDbi_1.48.0  
##  [58] munsell_0.5.0          cellranger_1.1.0       tools_3.6.2           
##  [61] cli_2.0.1              generics_0.0.2         RSQLite_2.2.0         
##  [64] broom_0.5.4            fastmap_1.0.1          evaluate_0.14         
##  [67] yaml_2.2.1             knitr_1.28             bit64_0.9-7           
##  [70] fs_1.3.1               caTools_1.18.0         nlme_3.1-144          
##  [73] mime_0.9               compiler_3.6.2         rstudioapi_0.11       
##  [76] png_0.1-7              testthat_2.3.1         affyio_1.56.0         
##  [79] reprex_0.3.0           geneplotter_1.64.0     stringi_1.4.5         
##  [82] highr_0.8              Matrix_1.2-18          vctrs_0.2.2           
##  [85] pillar_1.4.3           lifecycle_0.1.0        BiocManager_1.30.10   
##  [88] bitops_1.0-6           httpuv_1.5.2           R6_2.4.1              
##  [91] latticeExtra_0.6-29    affy_1.64.0            promises_1.1.0        
##  [94] KernSmooth_2.23-16     gridExtra_2.3          gtools_3.8.1          
##  [97] assertthat_0.2.1       rprojroot_1.3-2        withr_2.1.2           
## [100] GenomeInfoDbData_1.2.2 hms_0.5.3              rpart_4.1-15          
## [103] rmarkdown_2.1          shiny_1.4.0            lubridate_1.7.4       
## [106] base64enc_0.1-3